In [1]:
import pandas as pd
import numpy as np
import nltk
import time
import csv
from bs4 import BeautifulSoup
from scipy.sparse import lil_matrix
from scipy.io import mmwrite, mmread
from ast import literal_eval
import itertools
from itertools import izip
from taggerfunctions import *
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
uselesssymbols = ['. ', '\n', "'", '"', '(', ')', ',', ';', ':', '?', '!', '&', '$']
def tokenizeWords(entry):
    # strip code blocks, links and images, then reduce the entry to plain ASCII text
    soup = BeautifulSoup(entry)
    for tag in soup.find_all(["pre", "code", "a", "img"]):
        tag.decompose()
    entry = soup.get_text().encode('ascii', 'ignore')
    # replace punctuation that carries no information with spaces
    for symbol in uselesssymbols:
        entry = entry.replace(symbol, ' ')
    entrytok = nltk.word_tokenize(entry)
    entrytok = [w.lower() for w in entrytok]
    return tag_pos(entrytok)

In [3]:
def tag_pos(entrytok):
    entryselect = []
    entrytoktag = braubt_tagger.tag(entrytok)
    # drop tokens whose POS tag is unlikely to carry tag-relevant information
    for tok, tag in entrytoktag:
        if tag not in ('VBP', 'CC', 'CD', 'RB', 'TO', 'VB', 'DT', 'IN', 'PRP', 'VBZ', 'WDT', '-NONE-'):
            try:
                # lemmatize with the WordNet POS derived from the Penn Treebank tag
                tok_lemmatized = lemmatizer.lemmatize(tok, get_wordnet_pos(tag))
            except Exception:
                # fall back to the default (noun) lemmatization if the tag cannot be mapped
                tok_lemmatized = lemmatizer.lemmatize(tok)
            entryselect.append(tok_lemmatized)
    return entryselect

In [4]:
braubt_tagger = braubt_Tagger()
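
The module taggerfunctions is not shown here. A minimal sketch of what get_wordnet_pos and braubt_Tagger could look like, assuming a plain unigram/bigram/trigram backoff tagger trained on the Brown corpus (the actual module may differ, e.g. by adding a Brill tagger on top of the backoff chain):


In [ ]:
from nltk.corpus import brown, wordnet

def get_wordnet_pos(treebank_tag):
    # map a Penn Treebank tag to a WordNet POS constant; raise if there is no mapping
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    raise ValueError('no WordNet POS for tag ' + treebank_tag)

def braubt_Tagger():
    # backoff chain: default 'NN' -> unigram -> bigram -> trigram, trained on Brown
    train_sents = brown.tagged_sents()
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    return t3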

In [5]:
def getDict(fname):
    # read a Series that was written with Series.to_csv back into a pandas Series
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        dictWords = {rows[0]: literal_eval(rows[1]) for rows in reader}
    return pd.Series(dictWords)

def getSeries(fname, fromRow, toRow):
    # read only the rows fromRow (inclusive) to toRow (exclusive) into a pandas Series
    dictWords = {}
    rowNum = 0
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            if fromRow <= rowNum < toRow:
                dictWords[literal_eval(row[0])] = literal_eval(row[1])
            if rowNum == toRow:
                return pd.Series(dictWords)
            rowNum += 1
    return pd.Series(dictWords)
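
A quick round-trip sanity check for these helpers, assuming an older pandas where Series.to_csv writes plain index,value rows without a header (the file name demoSeries.csv is only for illustration):


In [ ]:
s = pd.Series({'python': 123, 'c++': 45})
s.to_csv("demoSeries.csv")
s_back = getDict("demoSeries.csv")    # values are parsed back with literal_eval
print(s_back['python'])               # 123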

compute keyword frequencies


In [ ]:
reader = pd.read_csv("train.csv", verbose=True, chunksize=100000)
kwlist = []
for chunk in reader:
    # the Tags column holds space-separated tags; collect them across all chunks
    for entry in chunk['Tags']:
        kwlist.extend(entry.split())
kwDist = nltk.FreqDist(kwlist)

In [14]:
kwDist_series = pd.Series(kwDist)

In [16]:
kwDist_series.to_csv("kwDist.csv")

In [17]:
kwDist = {}
with open('kwDist.csv', 'r') as f:
    reader = csv.reader(f)
    kwDist = {rows[0]:literal_eval(rows[1]) for rows in reader}
kwDist = pd.Series(kwDist)

In [21]:
kwDist.plot(50)
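
kwDist.plot(50) plots the Series as-is; a hedged alternative that shows only the 50 most frequent keywords, assuming a pandas version with Series.sort_values (older versions use Series.order):


In [ ]:
top50 = kwDist.sort_values(ascending=False)[:50]
top50.plot(kind='bar')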


create dict for Keywords


In [ ]:
# map each keyword (tag) to a row index of the co-occurrence matrix
dictKeys = dict(zip(kwDist.keys(), range(0, len(kwDist))))

In [195]:
dictKeys = pd.Series(dictKeys)

In [108]:
dictKeys.to_csv("dictKeys.csv")

read in dictKeys:

In [5]:
dictKeys = {}
with open('dictKeys.csv', 'r') as f:
    reader = csv.reader(f)
    dictKeys = {rows[0]:literal_eval(rows[1]) for rows in reader}
dictKeys = pd.Series(dictKeys)

create inverse dict for Keywords:


In [6]:
invdictKeys = {}
for key, value in dictKeys.iteritems():
    invdictKeys[value] = key
invdictKeys = pd.Series(invdictKeys)

In [7]:
invdictKeys.to_csv("invdictKeys.csv")

compute the number of distinct words in the set of titles and create a dict for the distinct words


In [6]:
distinctWords = dict()
with open("tWordsnew.csv", "r") as f:
    reader = csv.reader(f)
    for rows in reader:
        # each row holds the tokenized title of one entry as a Python list literal
        entrylst = literal_eval(rows[1])
        for word in entrylst:
            if word not in distinctWords:
                distinctWords[word] = 1
            else:
                distinctWords[word] += 1

In [8]:
# keep only words that occur more than 10 times
distinctWords2 = {key: value for key, value in distinctWords.iteritems() if value > 10}

In [10]:
dWords = pd.Series(distinctWords2)

In [12]:
# map each distinct title word to a column index of the co-occurrence matrix
dictWords = dict(zip(dWords.keys(), range(0, len(dWords))))

In [13]:
dictWords = pd.Series(dictWords)

In [14]:
dictWords.to_csv("dictWordsTitleNew.csv")

In [9]:
dictWords = {}
with open('dictWords.csv', 'r') as f:
    reader = csv.reader(f)
    dictWords = {rows[0]:literal_eval(rows[1]) for rows in reader}
dictWords = pd.Series(dictWords)

In [15]:
invdictWords = dict(zip(range(0,len(dWords)), dWords.keys()))
invdictWords = pd.Series(invdictWords)

In [10]:
invdictWords = {}
for key, value in dictWords.iteritems():
    invdictWords[value] = key
invdictWords = pd.Series(invdictWords)

In [16]:
invdictWords.to_csv("invdictWordsNew.csv")

create dictionary for distinct words for body of question:


In [70]:
distinctWordsBody = dict()
with open("bWords_0-638582.csv", "r") as f:
    reader = csv.reader(f)
    for rows in reader:
        entrylst = literal_eval(rows[1])
        for word in entrylst:
            if word not in distinctWordsBody:
                distinctWordsBody[word] = 1
            else:
                distinctWordsBody[word] += 1

In [71]:
distinctWordsBody2 = {key: value for key, value in distinctWordsBody.iteritems() if value > 10}
print(len(distinctWordsBody2))
dWordsBody = pd.Series(distinctWordsBody2)
dictWordsBody = dict(zip(dWordsBody.keys(), range(0, len(dWordsBody))))
dictWordsBody = pd.Series(dictWordsBody)
dictWordsBody.to_csv("dictWordsBody_0-638582.csv")

invdictWordsBody = dict(zip(range(0, len(dWordsBody)), dWordsBody.keys()))
invdictWordsBody = pd.Series(invdictWordsBody)
invdictWordsBody.to_csv("invdictWordsBody_0-638582.csv")


42169

In [8]:
fRange = ['0-638582', '600000-1200000', '1200000-1800000','1800000-2400000','2400000-3000000','3000000-3600000',
          '3600000-4200000','4200000-4800000','4800000-5400000','5400000-6000000','6000000-6034195']
dictWordsBody = dict()
for fileRange in fRange:
    fname = 'dictWordsBody_' + fileRange + '.csv'
    print(fname)
    with open(fname, "r") as f:
        reader = csv.reader(f)
        for rows in reader:
            if rows[0] not in dictWordsBody:
                dictWordsBody[rows[0]] = literal_eval(rows[1])


dictWordsBody_0-638582.csv
dictWordsBody_600000-1200000.csv
dictWordsBody_1200000-1800000.csv
dictWordsBody_1800000-2400000.csv
dictWordsBody_2400000-3000000.csv
dictWordsBody_3000000-3600000.csv
dictWordsBody_3600000-4200000.csv
dictWordsBody_4200000-4800000.csv
dictWordsBody_4800000-5400000.csv
dictWordsBody_5400000-6000000.csv
dictWordsBody_6000000-6034195.csv

In [10]:
dictWordsBody = pd.Series(dictWordsBody)

In [11]:
dictWordsBody.to_csv("dictWordsBodyFull.csv")

compute the co-occurrence matrix cooc as a sparse lil_matrix for the titles of the train data:

Careful: the entries have to be assigned into a lil_matrix, but saving it to a file with mmwrite automatically converts it to a coo_matrix. Save and reload the matrix before continuing to use it, otherwise the indices will change the next time.
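
A minimal sketch of that save/load round trip on a tiny throwaway matrix (the file name demo.mtx is only for illustration):


In [ ]:
m = lil_matrix((3, 4))
m[1, 2] = 5                 # item assignment is only efficient on a lil_matrix
mmwrite("demo.mtx", m)      # mmwrite stores the matrix in coordinate (COO) format
m2 = mmread("demo.mtx")     # mmread returns a coo_matrix
m2 = m2.tolil()             # convert back before assigning further entries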


In [80]:
coocMat_lil = lil_matrix( (len(dictKeys),len(dictWords)) )

In [ ]:
lemmatizer = WordNetLemmatizer()
reader = pd.read_csv("train.csv", chunksize=100000)
count = 1
timeStart = time.time()
for chunk in reader:
    for tags, entry in zip(chunk['Tags'], chunk['Title']):
        # tokenize/lemmatize the title and keep only words that are in the dictionary
        entryselect = tokenizeWords(entry)
        gen = (word for word in entryselect if word in dictWords.keys())
        splitTags = tags.split()
        for word in gen:
            for tag in splitTags:
                coocMat_lil[dictKeys[tag], dictWords[word]] += 1
        if count % 100000 == 0:
            print("entry {0:d} finished".format(count))
            print("time for 100000 loops: {0:.0f}s".format(time.time() - timeStart))
            timeStart = time.time()
        count += 1

In [82]:
mmwrite("coocMatTitleNew_coo.mtx", coocMat_lil)

In [4]:
coocMat_coo = mmread("coocMat_coo.mtx")

In [5]:
coocMat_csr = coocMat_coo.tocsr()
coocMat_csc = coocMat_coo.tocsc()
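
As a hedged sanity check, a single (tag, word) co-occurrence count can be looked up in the CSR matrix via the two dictionaries (the tag and word literals below are only placeholders and may not be present in the dicts):


In [ ]:
tag, word = 'python', 'list'
if tag in dictKeys and word in dictWords:
    print(coocMat_csr[dictKeys[tag], dictWords[word]])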

In [92]:
coocMat_coo = None

compute the co-occurrence matrix for the body words:

In [7]:
from collections import Counter
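
In the loops below the counts are first accumulated in a Counter keyed by (row, column) and only written into the lil_matrix in one pass at the end, which avoids millions of individual sparse-matrix updates. A minimal sketch of that pattern on a hypothetical 2x4 matrix:


In [ ]:
demoCounts = Counter()
demoCounts[(0, 3)] += 2     # (row, column) -> accumulated co-occurrence count
demoCounts[(1, 1)] += 1
demoMat = lil_matrix((2, 4))
for (r, c), v in demoCounts.iteritems():
    demoMat[r, c] = v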

In [ ]:
lemmatizer = WordNetLemmatizer()
count = 1
countChunk = 1
reader2 = pd.read_csv("bWords_5400000-6000000.csv", header=None, index_col=0, chunksize=100000)
reader = pd.read_csv("train.csv", chunksize=100000)
fnameDict = 'dictWordsBodyFull.csv'
dictWordsBody = getDict(fnameDict)
coocMatBody_lil = lil_matrix( (len(dictKeys),len(dictWordsBody)) )
timeStart = time.time()
dictforMat = Counter()
for chunk, chunk2 in izip(reader, reader2):
    for tags, entry in zip(chunk['Tags'], chunk2[1]):
        # the body column holds the tokenized body of one entry as a Python list literal
        entrylst = literal_eval(entry)
        set_entry = set(entrylst)
        splitTags = tags.split()
        iterWords = [w for w in set_entry if w in dictWordsBody.keys()]
        # accumulate the counts in a Counter instead of updating the lil_matrix directly
        for tag, word in itertools.product(splitTags, iterWords):
            dictforMat[dictKeys[tag], dictWordsBody[word]] += entrylst.count(word)
        if count % 100000 == 0:
            print("entry {0:d} finished".format(count))
            print("time for 100000 loops: {0:.0f}s".format(time.time() - timeStart))
            timeStart = time.time()
        if count % 600000 == 0:
            # flush the accumulated counts into the lil_matrix and save it
            for key, value in dictforMat.iteritems():
                coocMatBody_lil[key] = value
            fname = 'coocMatBody_coo_5400000-6000000.mtx'
            mmwrite(fname, coocMatBody_lil)
        count += 1
    countChunk += 1
    if countChunk > 6:
        break

for the last chunk, which is smaller than 600000 rows:


In [11]:
lemmatizer = WordNetLemmatizer()
count = 1
countChunk = 1
reader2 = pd.read_csv("bWords_6000000-6034195.csv", header=None, index_col=0, chunksize=100000)
reader = pd.read_csv("train.csv", chunksize=100000)
fnameDict = 'dictWordsBodyFull.csv'
dictWordsBody = getDict(fnameDict)
coocMatBody_lil = lil_matrix( (len(dictKeys),len(dictWordsBody)) )
timeStart = time.time()
dictforMat = Counter()
for chunk, chunk2 in izip(reader, reader2):
    for tags, entry in zip(chunk['Tags'], chunk2[1]):
        entrylst = literal_eval(entry)
        set_entry = set(entrylst)
        splitTags = tags.split()
        iterWords = [w for w in set_entry if w in dictWordsBody.keys()]
        for tag, word in itertools.product(splitTags, iterWords):
            dictforMat[dictKeys[tag], dictWordsBody[word]] += entrylst.count(word)
        if count % 10000 == 0:
            print("entry {0:d} finished".format(count))
            print("time for 10000 loops: {0:.0f}s".format(time.time() - timeStart))
            timeStart = time.time()
        if count % 34195 == 0:
            # flush the accumulated counts and save the matrix for this partial range
            for key, value in dictforMat.iteritems():
                coocMatBody_lil[key] = value
            fname = 'coocMatBody_coo_6000000-6034195.mtx'
            mmwrite(fname, coocMatBody_lil)
        count += 1
    countChunk += 1
    if countChunk > 1:
        break

merge the co-occurrence matrices of the partial train data:

In [ ]:
fRange = ['0-600000', '600000-1200000', '1200000-1800000','1800000-2400000','2400000-3000000','3000000-3600000',
          '3600000-4200000','4200000-4800000','4800000-5400000','5400000-6000000','6000000-6034195']
dictWordsBody = getDict('dictWordsBodyFull.csv')
dictKeys = getDict('dictKeys.csv')
coocMatBodyFull_lil = lil_matrix( (len(dictKeys),len(dictWordsBody)) )

for r in fRange:
    fname = "coocMatBody_coo_" + r + ".mtx"
    print("adding matrix {0:s}".format(r))
    coocMatBodyPart_coo = mmread(fname)
    coocMatBodyPart_csr = coocMatBodyPart_coo.tocsr()
    # add every nonzero entry of the partial matrix into the full matrix
    rows, columns = coocMatBodyPart_csr.nonzero()
    for row, column in zip(rows, columns):
        coocMatBodyFull_lil[row, column] += coocMatBodyPart_csr[row, column]
fname = 'coocMatBodyFull_coo.mtx'
mmwrite(fname, coocMatBodyFull_lil)

In [6]:
coocMatBodyFull_coo = mmread("coocMatBodyFull_coo.mtx")

In [7]:
coocMatBodyFull_csr = coocMatBodyFull_coo.tocsr()

different method: load the partial co-occurrence matrices, convert them to CSR format and then use the overloaded addition operator


In [27]:
fname = "coocMatBody_coo_6000000-6034195.mtx"
coocMatBodyPart_coo = mmread(fname)

In [28]:
coocMatBodyPart_csr = coocMatBodyPart_coo.tocsr()

In [29]:
coocMatBodyFull_csr = coocMatBodyFull_csr + coocMatBodyPart_csr

In [30]:
mmwrite('coocMatBodyFull2_csr.mtx', coocMatBodyFull_csr)